#### ---- SCRIPT 07: The purpose of this script is to test the ideological balance of guests on the seven selected ---- ####
#### ----programmes using their Twitter/X ideal points. Two tests are conducted (Mann-Whitney U and Hartigan's Dip) ---- ####
#### ---- along with a descriptive raincloud plot. Additional checks are then conducted, asssessing how these results ---- ####
#### ---- vary based on whether missing user ideal points are imputed using organisational proxies, with MPs removed ---- ####
#### ---- and when splitting the programme episodes into quarterly time intervals.  ---- ####
 
options(scipen=999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data wrangling
library(ggridges) # generate ridge plots
library(tibble) # data structuring
library(tidyr) # data tidying
library(diptest) # Hartigan's dip test
library(lubridate) # working with dates
library(multcompView) # visualisations of paired comparisons

#### ---- DIRECTORIES ---- ####

raw_dir <- "00-raw_data/"
processed_dir <- "01-processed_data/"
figure_dir <- "02-figures/"

#### ---- LOAD DATA ---- ####

# Twitter user data 
user_data <- fread(paste0(processed_dir,"user_ideal_point_data_updated.csv"))

# Guest masterlist
guest_data <- fread(paste0(processed_dir,"username_master_list_with_ideal_points.csv"))

# TV guest lists
guest_lists <- list.files(paste0(raw_dir,"guest_lists/"),full.names = TRUE) %>%
  lapply(fread)

#### ---- MERGE THE GUEST LISTS TOGETHER AND MATCH TO THEIR IDEAL POINTS ---- ####

# Concatenate the individual guest lists together into one dataset
guest_lists_merged <- rbindlist(guest_lists)

# Match each guest appearance in the dataset to their corresponding ideal point and the ideal point of their
# organisation and organisation affiliates. These will only be used where an individual does not have an ideal point
# of their own.
guest_lists_merged <- left_join(guest_lists_merged,guest_data,by=c("Name" = "name"))

##### ---- TV SHOW IDEOLOGICAL REPRESENTATION ANALYSIS ---- ####

# Where a user does not have an individual ideal point, replace with their organisation's ideal point or the ideal point of their
# organisation's mean ideal point of affiliates
guest_lists_merged$final_ideal_point <- ifelse(is.na(guest_lists_merged$user_ideal_point),
                                               ifelse(is.na(guest_lists_merged$organisation_ideal_point),
                                                      guest_lists_merged$affiliate_mean_ideal_point,
                                                      guest_lists_merged$organisation_ideal_point),guest_lists_merged$user_ideal_point)

# Add all ordinary and elite Twitter users to the dataset to allow for comparisons with these groups alongside the T.V show
# distributions
twitter_ideal_points <- user_data %>% 
  select(user_ideal_point,elite_account) %>%
  rename(Show = elite_account)

guest_lists_merged <- rbind(guest_lists_merged,twitter_ideal_points,fill=TRUE)

# Summary table 
median(user_data$user_ideal_point, na.rm = T)
median(guest_data$user_ideal_point, na.rm = T)

ideal_point_summary <- guest_lists_merged %>% 
  group_by(Show) %>%
  filter(!is.na(user_ideal_point)) %>%
  summarise(count = n(),
            median_ideal = round(median(user_ideal_point,na.rm = T),2),
            mean_ideal = round(mean(user_ideal_point,na.rm = T),2),
            sd_ideal = round(sd(user_ideal_point,na.rm = T),2),
            min_ideal = round(min(user_ideal_point,na.rm = T),2),
            max_ideal = round(max(user_ideal_point,na.rm = T),2))

# Order shows by data availability for benefit of plot interpretability
shows_ideal_point_plot_ordering <- guest_lists_merged %>% 
  group_by(Show) %>%
  summarise(median_ideal_point = median(user_ideal_point,na.rm=T)) %>%
  arrange(desc(median_ideal_point)) %>%
  pull(Show)

guest_lists_merged$Show <- factor(guest_lists_merged$Show , levels = shows_ideal_point_plot_ordering)

# Raincloud distribution plot
ideal_point_distributions_by_show <- ggplot(guest_lists_merged, aes(x = user_ideal_point, y = Show, fill = Show)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.1, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .15, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(y = "", x = "Left-Right Guest Position") +
  theme(axis.title.y = element_blank(), 
        legend.position = "none")

ggsave(paste0(figure_dir, "ideal_point_by_show_raincloud_plot.png"),
       ideal_point_distributions_by_show, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

##### ---- STATISTICAL TESTS ---- ####

# ANOVA test
anova_result <- aov(user_ideal_point ~ Show, data = guest_lists_merged)
tukey_result <- TukeyHSD(anova_result)

# Tidy the Tukey result
tukey_df <- broom::tidy(tukey_result)

# Sort by absolute value of mean difference
tukey_df <- tukey_df[order(tukey_df$estimate), ]

# Convert contrast to a factor with levels in the order of estimate
tukey_df$contrast <- factor(tukey_df$contrast, levels = tukey_df$contrast)

# Plot
tukey_plot <- ggplot(tukey_df, aes(x = contrast, y = estimate)) +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(x = "Comparison",
       y = "Difference in Mean Guest Left-Right Position") +
  theme_minimal()

ggsave(paste0(figure_dir,"ideal_point_by_show_pairwise_tukey_comparisons.png"),
       tukey_plot, 
       units="in", width=8, height=8, dpi=300,
       bg="white")

# Hartigan's Dip Test (for multimodality)

x <- dip.test(guest_lists_merged$user_ideal_point,
              simulate.p.value = FALSE)

dip_test_stats <- data.frame()

# Iterate through the dip test results and store in readable data frame
for (show in shows_ideal_point_plot_ordering){
  
  show_data <- guest_lists_merged %>%
    filter(Show == show)
  
  dip_stats <- dip.test(show_data$user_ideal_point,
                        simulate.p.value = FALSE)
  
  show_dip_stats_df <- data.frame(show = show,
                                  count = nrow(show_data[!is.na(user_ideal_point),]),
                                  mean_ideal = mean(show_data$user_ideal_point,na.rm = T),
                                  median_ideal= median(show_data$user_ideal_point,na.rm = T),
                                  s.d = sd(show_data$user_ideal_point,na.rm = T),
                                  min = min(show_data$user_ideal_point,na.rm = T),
                                  max = max(show_data$user_ideal_point,na.rm = T),
                                  dip_stat = dip_stats$statistic,
                                  p_value = dip_stats$p.value)
  
  dip_test_stats <- rbind(dip_test_stats,show_dip_stats_df)
}  

print(dip_test_stats)

#### ---- Time-series Checks ---- ####

# Filter out non-programme data 
guest_lists_merged <- guest_lists_merged %>% filter(Show != "Ordinary Account" & Show != "Elite Account")

# Group dates into quarters 
guest_lists_merged$Date <- dmy(guest_lists_merged$Date)

guest_lists_merged$date_interval <- factor(lubridate::quarter(guest_lists_merged$Date, type="year.quarter"))

guest_lists_merged <- guest_lists_merged %>% 
  mutate(date_interval = case_when(
    date_interval == "2022.1" ~ "Jan-Mar 22",
    date_interval == "2022.2" ~ "Apr-Jun 22",
    date_interval == "2022.3" ~ "Jul-Sept 22",
    date_interval == "2022.4" ~ "Oct-Dec 22",
    date_interval == "2023.1" ~ "Jan-Mar 23",
    date_interval == "2023.2" ~ "Apr-Jun 23",
    date_interval == "2023.3" ~ "Jul-Sept 23",
    date_interval == "2023.4" ~ "Oct-Dec 23",
    TRUE ~ date_interval
  ))

guest_lists_merged$date_interval <- factor(guest_lists_merged$date_interval,
                                           levels = c("Oct-Dec 23","Jul-Sept 23","Apr-Jun 23","Jan-Mar 23",
                                                      "Oct-Dec 22","Jul-Sept 22","Apr-Jun 22","Jan-Mar 22"))
                                             
# Create an "Overall" group by copying the overall data and binding back to original dataframe
guest_lists_merged_total <- guest_lists_merged
guest_lists_merged_total$Show <- "Overall"

guest_lists_merged <- rbind(guest_lists_merged,guest_lists_merged_total)

# Plot
time_series_ideal_plot <- ggplot(guest_lists_merged,aes(x=user_ideal_point,y=date_interval)) +
  geom_density_ridges(
    scale = 1, 
    alpha = 0.7,
    bandwidth = 0.1, 
    rel_min_height = 0.01) + 
  geom_boxplot(
    width = .3, 
    outlier.shape = NA, 
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(y = "", x = "Left-Right Guest Position") +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 8)) +
  facet_wrap(~Show, scales = "fixed",ncol = 4)

ggsave(paste0(figure_dir,"ideal_point_time_series.png"),
       time_series_ideal_plot, 
       units="in", width=7, height=4, dpi=300,
       bg="white")

# Tests for statistical significance between time points

# Median points
time_median_test <- guest_lists_merged %>%
  group_by(date_interval,Show) %>%
  summarise(count = n(),
            mean_ideal = mean(user_ideal_point,na.rm=T),
            median_ideal = median(user_ideal_point,na.rm=T))

guest_lists_merged_time <- guest_lists_merged[Show == "Overall",]

# ANOVA test
anova_result_time <- aov(user_ideal_point ~ date_interval, data = guest_lists_merged_time)
tukey_result_time <- TukeyHSD(anova_result_time)

# Tidy the Tukey result
tukey_time_df <- broom::tidy(tukey_result_time)

# Sort by absolute value of mean difference
tukey_df <- tukey_df[order(tukey_df$estimate), ]

#### ---- END ---- ####
